Library imports
from __future__ import print_function
import platform
import sys
import nltk
import requests
import re
from requests import get
from itertools import repeat
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, ImageColorGenerator
General system information and library versions
# Print platform details and the version of every library the notebook
# depends on, so results can be reproduced later.
import platform; print(platform.platform())
import sys; print('Python', sys.version)
import os; print('OS', os.name)
import bs4; print('Beautiful Soup', bs4.__version__)
#import urllib;print('Urllib', urllib.request.__version__)
import re; print('Regex', re.__version__)
import spacy; print('SpaCy', spacy.__version__)
#import gensim;print('Gensim', gensim.__version__)
import sklearn; print('Sklearn', sklearn.__version__)
import scipy; print('Scipy', scipy.__version__)
import matplotlib; print('Matplotlib', matplotlib.__version__)
# os.environ['CONDA_DEFAULT_ENV'] raises KeyError when the notebook is not
# run inside a conda environment; .get() degrades gracefully instead.
print(os.environ.get('CONDA_DEFAULT_ENV', 'not running inside a conda environment'))
Function to parse the first 25 reviews given a movie link
def movie_review_crawler(url):
    """Fetch one IMDB review page and return the review texts it contains."""
    page = get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Each review body sits in a <div class="text show-more__control">.
    containers = soup.find_all('div', class_='text show-more__control')
    return [container.text for container in containers]
Function to create a list of 100 reviews of science fiction movies Films used: (Matrix, Inception, Sunshine, Dragonball)
def my_review_crawler():
matrix = movie_review_crawler("https://www.imdb.com/title/tt0133093/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=10")
matrix2 = movie_review_crawler("https://www.imdb.com/title/tt0133093/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=2")
inception = movie_review_crawler("https://www.imdb.com/title/tt1375666/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=10")
inception2 = movie_review_crawler("https://www.imdb.com/title/tt1375666/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=2")
sunshine = movie_review_crawler("https://www.imdb.com/title/tt0448134/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=9")
sunshine2 = movie_review_crawler("https://www.imdb.com/title/tt0448134/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=2")
dragonball = movie_review_crawler("https://www.imdb.com/title/tt1098327/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=8")
dragonball2 = movie_review_crawler("https://www.imdb.com/title/tt1098327/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=1")
scifi_reviews = matrix + inception + sunshine + dragonball + matrix2 + inception2 + sunshine2 + dragonball2
return scifi_reviews
Fetch all reviews and place them in a list called reviews
reviews = my_review_crawler()
Inspect the first 10 reviews collected
reviews[:10][:50]
Build custom stop word list
from sklearn.feature_extraction import text

# Domain-specific noise: character/actor/film names plus a few frequent but
# uninformative tokens that would otherwise dominate every topic.
extra_stop_words = [
    "film", "movie", "just", "going", "story", "goku", "nolan", "piccolo",
    "james", "dicaprio", "series", "cartoon", "know", "going", "does", "mal",
    "didn", "actually", "neo", "cobbs", "boyle", "icarus", "make", "things",
    "page", "job", "haven", "say", "don", "does", "matrix", "sunshine",
    "dragonball", "inception", "movies", "christopher", "gordon-levitt",
    "joseph", "yamcha", "roshi", "cobb", "michael", "caine", "ellen", "saito",
    "ariadne", "murphy", "cillian", "dragon", "ball", "hugo", "weaving",
    "keanu", "reeves", "danny", "hey",
]
# Merge with sklearn's built-in English list (union dedupes the repeats).
stopwords = text.ENGLISH_STOP_WORDS.union(extra_stop_words)
Create count vectors and clean up the data (keep only alphabetic tokens and remove stop words)
# Bag-of-words vectors: keep tokens of 3+ letters (hyphens allowed), drop
# stop words, and ignore terms appearing in <5 documents or >90% of them.
# The token_pattern is now a raw string: '\-' in a normal string literal is
# an invalid escape sequence (DeprecationWarning today, a SyntaxError in
# future Python versions).
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words=stopwords, lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(reviews)
Information on generated data vector
data_vectorized
We will create 6 topics. The goal of our analysis is to categorize all reviews into categories that can cover most of the distance map.
NUM_TOPICS = 6
Build a Latent Dirichlet Allocation Model
# Build a Latent Dirichlet Allocation Model
# Fit LDA on the document-term counts; the fixed seed keeps runs reproducible.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=20,
    learning_method='online',
    random_state=41,
)
lda_Z = lda_model.fit_transform(data_vectorized)
print("LDA Model:")
print("Shape: ", lda_Z.shape)
print("Investigate the weights of the first corpus document in each topic space:")
print(lda_Z[0])
Function to assist us in inspecting the inferred topics
def print_topics(model, vectorizer, top_n=15):
    """Print the ``top_n`` highest-weight terms (with weights) per topic.

    model      -- fitted decomposition model exposing ``components_``
                  (topics x terms), e.g. LDA or NMF.
    vectorizer -- the fitted vectorizer that produced the term columns.
    top_n      -- number of terms to show for each topic (default 15).
    """
    # Hoist the vocabulary lookup out of the loop: the original rebuilt the
    # whole feature-name list once per printed term.  Prefer
    # get_feature_names_out(), since get_feature_names() was removed in
    # scikit-learn 1.2; fall back for older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
Inspect LDA model inferred topics
# Show the 15 strongest words of each inferred LDA topic.
print("LDA Model Topics:")
print_topics(lda_model, vectorizer)
Testing with an unknown document
# Sanity-check: transform an unseen review (of "The Matrix Reloaded") and
# print its inferred topic-weight vector.
test = "Watching The Matrix Reloaded, one is absolutely entitled to say that it is overloaded, too lengthy action sequences for instance, and indeed, a way too lengthy dancing scene in Zion. But next to that, it is obvious that this sequal to The Matrix (1999)takes the story to a whole new dimension. Different characters define the working of the matrix, and the meaning of life itself, in different ways, depending on their onthological background. A conclusion is not (yet) given, which adds to the movie a kind of postmodern quality. For as far as the action sequences are concerned: Groundbreaking. You'll see stuff that you've never seen before. Sometimes the scenes are a little lengthy, which harmes the narrative, but that is compensated easily by the visual spectacle. And yes, the Architect at the end is difficult to understand, but when you watch the film more than once, you'll find out that it does make sense what he says. All together this movie may not be as fantastic as 'The Matrix', but it is definitely a good movie that will keep you thinking for a while."
x = lda_model.transform(vectorizer.transform([test]))[0]
print(x)
Build a Non-Negative Matrix Factorization Model
# Build a Non-Negative Matrix Factorization Model
# Fit NMF on the same document-term counts; fixed seed for reproducibility.
nmf_model = NMF(
    n_components=NUM_TOPICS,
    random_state=77,
)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print("NMF Model:")
print("Shape: ", nmf_Z.shape)
print("Investigate the weights of the first corpus document in each topic space:")
print(nmf_Z[0])
Inspect NMF model inferred topics
# Show the 15 strongest words of each inferred NMF topic.
print("NMF Model Topics:")
print_topics(nmf_model, vectorizer)
# Same held-out "Matrix Reloaded" review as the LDA check; print the topic
# weights the NMF model assigns to it.
test = "Watching The Matrix Reloaded, one is absolutely entitled to say that it is overloaded, too lengthy action sequences for instance, and indeed, a way too lengthy dancing scene in Zion. But next to that, it is obvious that this sequal to The Matrix (1999)takes the story to a whole new dimension. Different characters define the working of the matrix, and the meaning of life itself, in different ways, depending on their onthological background. A conclusion is not (yet) given, which adds to the movie a kind of postmodern quality. For as far as the action sequences are concerned: Groundbreaking. You'll see stuff that you've never seen before. Sometimes the scenes are a little lengthy, which harmes the narrative, but that is compensated easily by the visual spectacle. And yes, the Architect at the end is difficult to understand, but when you watch the film more than once, you'll find out that it does make sense what he says. All together this movie may not be as fantastic as 'The Matrix', but it is definitely a good movie that will keep you thinking for a while."
x = nmf_model.transform(vectorizer.transform([test]))[0]
print(x)
Visualization for LDA Model using pyLDAvis
# Interactive 2-D topic-distance visualisation of the LDA model.
# NOTE(review): the pyLDAvis.sklearn module was renamed to pyLDAvis.lda_model
# in pyLDAvis 3.4 — confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
# mds='tsne' chooses t-SNE for projecting topic distances onto the plane.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel
Visualization for NMF Model using pyLDAvis
# Same interactive visualisation for the NMF model (pyLDAvis only needs the
# fitted components, so it accepts NMF as well).
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(nmf_model, data_vectorized, vectorizer, mds='tsne')
panel
As we expected, NMF provided the best results. As we can see from the distance map above, the topics generated by NMF are almost equally weighted, are clearly separated, and cover the majority of the plane. Finally, looking at the top topic words, we can clearly see that NMF produced better and more distinct topics. LDA performed worse: the topics it generated had variable weights, some of them were very close to each other, and they ended up covering a smaller area of the plane. This is understandable, since LDA requires a large dataset to work efficiently.
LDA topic results
# LDA Results
# Hand-written one-line summaries of the six LDA topics, built directly as a
# list literal instead of index-by-index assignment into a placeholder list.
lda_res = [
    "Topic 0 - team perfect dangerous world",
    "Topic 1 - effects action best world science",
    "Topic 2 - sequence philosophical speed answer",
    "Topic 3 - dream people like idea mind reality",
    "Topic 4 - narrative sound visual machine vision",
    "Topic 5 - like time sun think way action",
]
print("LDA Topic Descriptions: ")
lda_res
NMF Topic Results
# NMF Results
# Hand-written one-line summaries of the six NMF topics, built directly as a
# list literal instead of index-by-index assignment into a placeholder list.
nmf_res = [
    "Topic 0 - like people action original anime",
    "Topic 1 - dream idea mind reality time",
    "Topic 2 - sun crew space ship mission earth",
    "Topic 3 - best action time think elements",
    "Topic 4 - special effects great world care",
    "Topic 5 - like bad looks effect work place",
]
print("NMF Topic Descriptions: ")
nmf_res
Wordcloud of each review
# Render one word cloud per collected review.
# Iterate the list itself instead of the hard-coded range(100): the original
# raised IndexError whenever fewer than 100 reviews were scraped (e.g. after
# an IMDB page-layout change) and silently skipped any review beyond 100.
for review_text in reviews:
    wordcloud = WordCloud(max_font_size=50, max_words=100,
                          background_color="white").generate(review_text)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()